The notebook that this notebook is based on can be found at Homeworks/HW1/PawelPawlik/main.ipynb.
Consider a following model:
f(x1, x2) = (x1 + x2)^2
Assume that x1, x2 ~ U[-1,1] and x1=x2 (full dependency)
Calculate PD profile for variable x1 in this model.
Extra task if you do not fear conditional expected values: Calculate ME and ALE profiles for variable x1 in this model.

Tools in Python: AIX360, Alibi, dalex, PDPbox; in R: pdp, DALEX, ALEplot. *Implement CP yourself for a potential bonus point.

| age | thall |
|---|---|
![]() |
![]() |
I use my own implementation of CP profiles. As we can see on the above plot, the categorical variable thall (Thalium Stress Test result, values 0–3) has a consistent influence on the model's prediction:
When we look at the age analysis we can conclude that:
the prediction is increasing with age for one observation and decreasing with age for another one. NOTE that you will need a model with interactions to observe such differences.
As we could already notice in point 2, there are observations in the dataset that have different CP profiles. On the above plot, there are two observations that have opposite profiles: one is increasing while the other is decreasing with age.
| age | thall |
|---|---|
![]() |
![]() |
![]() |
![]() |
I use my own implementation of PDP. As we can see on the above plots, the explanations from PDP and CP are consistent on the thall variable. However, the explanations differ on the age variable. We can now see a clear influence of the age variable on the final prediction. The prediction lowers with age up to ~60 and then rises with age. This is surprising (I would expect a more monotonic relation). However, we should be careful before we accept the above explanation because, as we saw in the CP analysis, we can expect high correlation between age and other variables (different behaviors of the CP profile between observations).
| age | thall | |
|---|---|---|
| Random Forest | ![]() |
![]() |
| Gradient Boosting | ![]() |
![]() |
| Logistic Regression | ![]() |
![]() |
I compared the PDP explanations between Random Forest, Gradient Boosting and Logistic Regression. As we can see, the explanations for Random Forest and Gradient Boosting are similar. This is not surprising: as both of these methods are tree-based, there is a chance that both of them learned the same relations.
When we compare Logistic Regression with the other two, we can see similar effects on the thall variable: a higher value means a lower prediction. However, we lose the step-like shape of the PDP profile on the mentioned variable. Meanwhile, looking at the age variable, we can observe that the Logistic Regression model picked up a clear relation between age and prediction (higher age -> higher prediction). Such a relation is not visible in the case of the other two models, which suggests that the described relation may not hold in the actual data.
Source code
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import RocCurveDisplay, roc_curve
from sklearn.base import ClassifierMixin
import seaborn as sns
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import shap
import dalex as dx
import random
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default = "notebook"
# Silence FutureWarnings that would otherwise clutter the notebook output.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Fix the global NumPy seed for reproducibility.
np.random.seed(42)
# Name of the target column in heart.csv.
OUTPUT_COLUMN = "output"
# Module-level scaler: fitted inside get_preprocessed_dataset() and reused by
# inverse_scaler() to map standardized values back to the original units.
scaler = StandardScaler()
def get_preprocessed_dataset():
    """Load heart.csv, one-hot encode categoricals, and return a standardized split.

    Returns:
        (X_train, X_test, y_train, y_test) where the feature frames have been
        standardized with the module-level ``scaler`` (fitted on the training
        split only, to avoid test-set leakage).
    """
    raw = pd.read_csv("heart.csv")
    encoded = pd.get_dummies(raw, columns=["cp", "restecg"])
    features = encoded.drop(columns=[OUTPUT_COLUMN])
    target = encoded[OUTPUT_COLUMN]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.33, random_state=42
    )
    # In-place assignment keeps the DataFrame structure (columns/index) intact.
    X_train[:] = scaler.fit_transform(X_train)
    X_test[:] = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test
# Build the train/test splits once at import time; reused by everything below.
X_train, X_test, y_train, y_test = get_preprocessed_dataset()
def inverse_scaler(column_values, variable: str):
    """Map standardized values of *variable* back to their original scale.

    Uses the module-level ``scaler`` fitted on X_train; the column position of
    *variable* in X_train selects the matching scale/mean entries.
    """
    position = X_train.columns.get_loc(variable)
    return column_values * scaler.scale_[position] + scaler.mean_[position]
# Fit every candidate classifier on the training split, then collect each
# model's class-probability predictions for the test split.
models = [RandomForestClassifier(), GradientBoostingClassifier(), LogisticRegression()]
for clf in models:
    clf.fit(X_train, y_train)
y_hats = [clf.predict_proba(X_test) for clf in models]
# Original notebook markdown (was fused onto this line by the export):
# "AIX360, Alibi, dalex, PDPbox; in R: pdp, DALEX, ALEplot.
#  *implement CP yourself for a potential bonus point"
class CP:
    """Ceteris Paribus (individual conditional expectation) profiles.

    For each observation, sweeps one variable over a grid spanning its range in
    the reference data while holding every other feature fixed, and records the
    model's positive-class probability at each grid point.
    """

    def __init__(self, model: ClassifierMixin, data: pd.DataFrame) -> None:
        self.model = model
        # Reference data: defines the grid range for each variable.
        self.data = data

    def get_df(self, X, variable):
        """Return per-observation CP profiles for *variable*.

        Rows are indexed by the grid values mapped back to the original
        (unscaled) range via the file-level ``inverse_scaler``; column j holds
        the predicted positive-class probability for observation j of *X*.
        """
        # np.linspace default: 50 evenly spaced grid points over the range.
        grid = np.linspace(self.data[variable].min(), self.data[variable].max())
        rows = []
        for value in grid:
            X_mod = X.copy()
            X_mod[variable] = value
            rows.append(self.model.predict_proba(X_mod)[:, 1])
        # FIX: the original used DataFrame.append (removed in pandas 2.0);
        # building the frame from the collected rows is the supported path
        # and produces the same integer RangeIndex columns.
        return pd.DataFrame(rows, index=inverse_scaler(grid, variable))

    def plot(self, X, variable):
        """Overlay the CP profile lines with the model's actual predictions for X."""
        profile_fig = px.line(self.get_df(X, variable))
        points_fig = px.scatter(
            x=inverse_scaler(X[variable], variable),
            y=self.model.predict_proba(X)[:, 1],
        )
        return go.Figure(data=profile_fig.data + points_fig.data).update_layout(
            title=f"Ceteris Paribus profile: {variable}<br>of {self.model}",
            xaxis_title="value",
            yaxis_title="prediction",
            showlegend=False,
        )
def test_cp_implementation(model, X=X_test.iloc[:20], variable="age"):
    """Visually sanity-check the CP class against dalex's predict_profile."""
    reference = dx.Explainer(model, X_test, y_test, verbose=False)
    reference.predict_profile(new_observation=X).plot(variables=[variable])
    CP(model, X_test).plot(X, variable).show()


test_cp_implementation(models[0])
def task2(model, X=X_test.iloc[:20]):
    """Task 2: show and save CP profiles for 'age' and 'thall'."""
    cp = CP(model, X_test)
    for idx, variable in enumerate(["age", "thall"]):
        fig = cp.plot(X, variable)
        fig.write_image(f"images/2_{idx}.png")
        fig.show()


task2(models[0])
# Original notebook markdown (was fused onto this line by the export):
# "...increasing with age for one observation and decreasing with age for
#  another one. NOTE that you will need to have a model with interactions to
#  observe such differences."
def task3(model, X, variable):
    """Task 3: CP profiles for selected observations, saved to images/3.png."""
    fig = CP(model, X_test).plot(X, variable)
    fig.write_image("images/3.png")
    fig.show()


# Observations 1 and 6 exhibit opposite CP trends with age.
task3(models[0], X_test.iloc[[1, 6]], "age")
class PDP:
    """Partial Dependence profiles.

    Sweeps one variable over a grid spanning its range in the reference data,
    predicting for *all* observations at each grid value; the PD profile is
    the mean prediction per grid value (taken in ``plot``).
    """

    def __init__(self, model: ClassifierMixin, data: pd.DataFrame) -> None:
        self.model = model
        # Reference data: supplies both the grid range and the observations
        # averaged over.
        self.data = data

    def get_df(self, variable):
        """Return a frame of predictions; rows indexed by unscaled grid values,
        one column per observation of the reference data."""
        # np.linspace default: 50 evenly spaced grid points over the range.
        grid = np.linspace(self.data[variable].min(), self.data[variable].max())
        rows = []
        for value in grid:
            X_mod = self.data.copy()
            X_mod[variable] = value
            rows.append(self.model.predict_proba(X_mod)[:, 1])
        # FIX: the original used DataFrame.append (removed in pandas 2.0);
        # constructing the frame from the collected rows is equivalent.
        return pd.DataFrame(rows, index=inverse_scaler(grid, variable))

    def plot(self, variable):
        """Plot the PD profile (mean prediction across observations)."""
        return px.line(self.get_df(variable).mean(axis=1)).update_layout(
            title=f"PDP profile: {variable}<br>of {self.model}",
            xaxis_title="value",
            yaxis_title="prediction",
            showlegend=False,
        )
def test_pdp_implementation(model, variable="age"):
    """Visually sanity-check the PDP class against dalex's model_profile."""
    reference = dx.Explainer(model, X_test, y_test, verbose=False)
    reference.model_profile().plot(variables=[variable])
    PDP(model, X_test).plot(variable).show()


test_pdp_implementation(models[0])
def task4(model):
    """Task 4: show and save PDP profiles for 'age' and 'thall'."""
    pdp = PDP(model, X_test)
    for idx, variable in enumerate(["age", "thall"]):
        fig = pdp.plot(variable)
        fig.write_image(f"images/4_{idx}.png")
        fig.show()


task4(models[0])
def task5(models):
    """Task 5: compare PDP profiles for 'age' and 'thall' across all models."""
    for model_idx, model in enumerate(models):
        pdp = PDP(model, X_test)
        for var_idx, variable in enumerate(["age", "thall"]):
            fig = pdp.plot(variable)
            # Flat image counter: 2 variables per model -> 5_0 ... 5_5,
            # matching the original running counter.
            fig.write_image(f"images/5_{2 * model_idx + var_idx}.png")
            fig.show()


task5(models)